setwd("~/Google Drive/Vibrant/ALS Project - Chao Gao/Paper outline/Code/V1.6")
options( java.parameters = "-Xmx12g" )
Sys.getenv("JAVA_HOME")
install.packages(c("dplyr","plyr","mi","reshape","reshape2","ggplot2","MASS","magrittr","psych","bartMachine","SuperLearner", "caret","e1071","rJava"))
library(dplyr)
library(plyr)
library(mi)
library(reshape)
library(reshape2)
library(ggplot2)
library(MASS)
library(magrittr)
library(psych)
library(bartMachine)
library(SuperLearner)
library(caret)
library(e1071)
##Replace missing with the median of the column
fun<-function(x){
x[is.na(x)] =median(x, na.rm=TRUE) #convert the item with NA to median value from the column
x #display the column
}
### Load dataset #### Load raw PRO-ACT training dataset (3103974 obs. of 6 variables) ####
df <- read.table("Data/all_forms_PROACT.txt", header=TRUE, sep="|", na.strings=c("NA","","NaN"), fill = TRUE, quote = "", stringsAsFactors = FALSE, comment.char="")
length(unique(df$feature_name))#6318
## [1] 6318
length(unique(df$SubjectID))#8635
## [1] 8635
df_slope <- read.table("Data/ALSFRS_slope_PROACT.txt", header=TRUE, sep="|", na.strings=c("NA","","NaN"), fill = TRUE, quote = "", stringsAsFactors = FALSE, comment.char="")
#remove study subjects with NA in ALSFRS_slope
df_slope <- df_slope[!is.na(df_slope$ALSFRS_slope), ]
df_v <- read.table("Data/all_forms_validate_spike.txt", header=TRUE, sep="|", na.strings=c("NA","","NaN"), fill = TRUE, quote = "", stringsAsFactors = FALSE, comment.char="")
length(unique(df_v$SubjectID))#200
## [1] 200
length(unique(df_v$feature_name))#768
## [1] 768
df_v_slope <- read.table("Data/ALSFRS_slope_validate_spike.txt",header=TRUE,sep="|", na.strings=c("NA","","NaN"), fill = TRUE, quote = "", stringsAsFactors = FALSE, comment.char="")
This figure shows the missingness of orgininal training dataset
##read data
df0 <- df
#remove Adverse Event
df1 <- df0[df0$form_name != "Adverse Event", ]
# select features
df1 <- df1[df1$feature_name %in% c("Age","Gender", "onset_delta" ,"onset_site","ALSFRS_Total","mouth","hands", "trunk","leg","respiratory", "fvc_percent","bp_diastolic","bp_systolic", "BMI", "pulse", "if_use_Riluzole", "Urine Ph","Albumin", "Protein", "Sodium", "Potassium","Bicarbonate", "Chloride", "Blood Urea Nitrogen (BUN)", "Creatinine","Alkaline Phosphatase","ALT(SGPT)", "Gamma-glutamyltransferase", "AST(SGOT)", "Bilirubin (total)", "White Blood Cell (WBC)", "Neutrophils","Lymphocytes", "Monocytes", "Eosinophils","Basophils","Red Blood Cells (RBC)","Hemoglobin", "Hematocrit","Platelets","CK","Triglycerides", "Total Cholesterol", "Glucose","HbA1c (Glycated Hemoglobin)", "Calcium", "Phosphorus", "Race"),]
### Organized the data
# change "feature delta" as numeric feature
df1$feature_delta<-as.numeric(df1$feature_delta)
#Organized the data format
colnames(df1)[1]<-"SubjectID"
#Read slope values
slope <- df_slope
dff<-merge(slope,df1,by="SubjectID",all.x=TRUE)
feature_lst <- unique(dff$feature_name)
list_length<-length(feature_lst)
lengthh<-c(1:list_length)
IID<-order(dff$feature_delta)
dff<-dff[IID,]
#deal with constant numeric features
constant_id<-c(30,37,41,45)
for (i in constant_id){
aaa<-dff[which(dff$feature_name==feature_lst[i]),]
colnames(aaa)[1]<-"SubjectID"
ID_lst <- unique(aaa$ID)
aaa$feature_value<-as.numeric(as.factor(aaa$feature_value))
cof<-ddply(aaa,~SubjectID,summarise,mean=mean(feature_value) )
colnames(cof)[2]<-paste(feature_lst[i],"mean",sep="_")
slope<-merge(slope,cof,by="SubjectID",all=TRUE)
}
# deal with constant categorical feature
for (i in c(32,11)){
aaa<-dff[which(dff$feature_name==feature_lst[i]),]
colnames(aaa)[1]<-"SubjectID"
ID_lst <- unique(aaa$ID)
aaa$feature_value<-as.numeric(aaa$feature_value)
cof<-ddply(aaa,~SubjectID,summarise,mean=mean(feature_value) )
colnames(cof)[2]<-paste(feature_lst[i],"mean",sep="_")
slope<-merge(slope,cof,by="SubjectID",all=TRUE)
}
# deal with time-varying features
for (i in lengthh[-c(30,32,37,41,45,11)]){
aaa<-dff[which(dff$feature_name==feature_lst[i]),]
aaa<-aaa[,-6]
colnames(aaa)[1]<-"SubjectID"
ID_lst <- unique(aaa$ID)
aaa$feature_value<-as.numeric(aaa$feature_value)
cof<-ddply(aaa,~SubjectID,summarise,max=max(feature_value),min=min(feature_value),
median=median(feature_value)
)
aaa<-na.omit(aaa)
mods = dlply(aaa, .(SubjectID), lm, formula = feature_value ~ feature_delta)
coefs = ldply(mods, coef)
cof<-merge(cof,coefs[,c(1,3)],by="SubjectID",all.x=TRUE)
colnames(cof)[2:5]<-paste(feature_lst[i],c("max","min","median","slope"),sep="_")
slope<-merge(slope,cof,by="SubjectID",all=TRUE)
}
# write dataset into local directory
write.csv(slope,"slope_raw_training_new.csv", row.names = FALSE)
#172 covariates(including SubjectID)
df0_v <- df_v
#remove Adverse Event
df1_v <- df0_v[df0_v$form_name != "Adverse Event", ]
df1_v <- df1_v[df1_v$feature_name %in% c("Age","Gender", "onset_delta" ,"onset_site","ALSFRS_Total","mouth","hands","trunk","leg","respiratory", "fvc_percent","bp_diastolic","bp_systolic", "BMI",
"pulse", "if_use_Riluzole", "Urine Ph","Albumin", "Protein", "Sodium", "Potassium","Bicarbonate", "Chloride","Blood Urea Nitrogen (BUN)", "Creatinine","Alkaline Phosphatase","ALT(SGPT)", "Gamma-glutamyltransferase", "AST(SGOT)","Bilirubin (total)", "White Blood Cell (WBC)", "Neutrophils","Lymphocytes", "Monocytes","Eosinophils","Basophils","Red Blood Cells (RBC)","Hemoglobin", "Hematocrit","Platelets","CK","Triglycerides","Total Cholesterol", "Glucose","HbA1c (Glycated Hemoglobin)", "Calcium", "Phosphorus", "Race"),]
df1<-df1_v
df1$feature_delta<-as.numeric(df1$feature_delta)
colnames(df1)[1]<-"SubjectID"
slope<- df_v_slope
dff<-merge(slope,df1,by="SubjectID",all.x=TRUE)
feature_lst <- unique(dff$feature_name)
list_length<-length(feature_lst)
lengthh<-c(1:list_length)
IID<-order(dff$feature_delta)
dff<-dff[IID,]
## deal with constant numeric features
constant_id<-c(7,12,13,4)
for (i in constant_id){
aaa<-dff[which(dff$feature_name==feature_lst[i]),]
colnames(aaa)[1]<-"SubjectID"
ID_lst <- unique(aaa$ID)
aaa$feature_value<-as.numeric(as.factor(aaa$feature_value))
cof<-ddply(aaa,~SubjectID,summarise,mean=mean(feature_value) )
colnames(cof)[2]<-paste(feature_lst[i],"mean",sep="_")
slope<-merge(slope,cof,by="SubjectID",all=TRUE)
}
#deal with constant categorical features
for (i in c(8,11)){
aaa<-dff[which(dff$feature_name==feature_lst[i]),]
colnames(aaa)[1]<-"SubjectID"
ID_lst <- unique(aaa$ID)
aaa$feature_value<-as.numeric(aaa$feature_value)
cof<-ddply(aaa,~SubjectID,summarise,mean=mean(feature_value) )
colnames(cof)[2]<-paste(feature_lst[i],"mean",sep="_")
slope<-merge(slope,cof,by="SubjectID",all=TRUE)
}
#deal with time-varying features
for (i in lengthh[-c(7,12,13,4,8,11)]) {
aaa<-dff[which(dff$feature_name==feature_lst[i]),]
aaa<-aaa[,-6]
colnames(aaa)[1]<-"SubjectID"
ID_lst <- unique(aaa$ID)
aaa$feature_value<-as.numeric(aaa$feature_value)
cof<-ddply(aaa,~SubjectID,summarise,max=max(feature_value),min=min(feature_value),
median=median(feature_value)
)
aaa<-na.omit(aaa)
mods = dlply(aaa, .(SubjectID), lm, formula = feature_value ~ feature_delta)
coefs = ldply(mods, coef)
cof<-merge(cof,coefs[,c(1,3)],by="SubjectID",all.x=TRUE)
colnames(cof)[2:5]<-paste(feature_lst[i],c("max","min","median","slope"),sep="_")
slope<-merge(slope,cof,by="SubjectID",all=TRUE)
}
# write file
write.csv(slope,"slope_raw_validate_new.csv", row.names = FALSE)
## quartz_off_screen
## 2
Figure 3 shows the disconcruency of the missing pattern between training and testing dataset
| Var | n.x | mean.x | sd.x | n.y | mean.y | sd.y |
|---|---|---|---|---|---|---|
| Age_mean | 2424 | 54.5375412541254 | 11.4590297077509 | 101 | 57.2573714905737 | 10.7192613017683 |
| Albumin_max | 2109 | 47.0136320531057 | 3.32140960354093 | 72 | 45.2361111111111 | 3.18222641325644 |
| Albumin_median | 2109 | 43.9532954006638 | 2.72994017117937 | 72 | 42.1666666666667 | 3.21089693931923 |
| Albumin_min | 2109 | 40.7584589853011 | 3.28539649348497 | 72 | 38.9027777777778 | 4.14239408977371 |
| Albumin_slope | 2080 | -0.00066304524359485 | 0.00964402390052194 | 71 | -0.00186345739650714 | 0.00971895029920851 |
| Alkaline.Phosphatase_max | 2109 | 106.724513987672 | 61.7278755340277 | 37 | 85.945945945946 | 41.41855860598 |
| Alkaline.Phosphatase_median | 2109 | 78.4561403508772 | 22.5725342855288 | 37 | 66.6216216216216 | 20.9862406661436 |
| Alkaline.Phosphatase_min | 2109 | 65.918018018018 | 18.8045151240071 | 37 | 57.0540540540541 | 17.7418932128106 |
| Alkaline.Phosphatase_slope | 2079 | 0.0245522287073217 | 0.107121072718988 | 36 | 0.0178481107415077 | 0.0454421655158345 |
| ALSFRS_slope | 2424 | -0.730785587584841 | 0.62748149433062 | 101 | -0.785880065754675 | 0.655454936567462 |
| ALSFRS_Total_max | 2424 | 31.7438118811881 | 5.27392088581054 | 101 | 30.0990099009901 | 5.96239037718103 |
| ALSFRS_Total_median | 2424 | 27.1917285478548 | 6.60869643850714 | 101 | 22.460396039604 | 8.62661091284313 |
| ALSFRS_Total_min | 2424 | 20.1295379537954 | 8.59290047306708 | 101 | 15.6039603960396 | 9.31029452586844 |
| ALSFRS_Total_slope | 2424 | -0.0232826672515287 | 0.017266353009286 | 101 | -0.0251154776907436 | 0.0171240457379447 |
| ALT.SGPT._max | 2412 | 53.9509950248756 | 43.8185743794078 | 78 | 59.0769230769231 | 34.5497079247035 |
| ALT.SGPT._median | 2412 | 32.9956467661692 | 15.5289390716099 | 78 | 32.3974358974359 | 14.8204314426129 |
| ALT.SGPT._min | 2412 | 23.174087893864 | 11.3071845494832 | 78 | 22.3589743589744 | 10.7413200946605 |
| ALT.SGPT._slope | 2384 | 0.00127031578973447 | 0.0806063706450099 | 77 | -0.0176662874687794 | 0.0782988486519406 |
| AST.SGOT._max | 2413 | 42.9656029838375 | 34.1855132163785 | 78 | 48.3333333333333 | 32.3907420902976 |
| AST.SGOT._median | 2413 | 29.1918773311231 | 9.60296304111526 | 78 | 28.8717948717949 | 9.35845887850946 |
| AST.SGOT._min | 2413 | 21.7508910070452 | 7.50312015209157 | 78 | 21.0897435897436 | 6.26510827931822 |
| AST.SGOT._slope | 2382 | -0.000658123656127784 | 0.0598963682399829 | 77 | -0.0122223386334234 | 0.0503250399310411 |
| Basophils_max | 1841 | 1.08853883758827 | 1.11627490221824 | 78 | 1.51538461538462 | 0.758857124802908 |
| Basophils_median | 1841 | 0.478625746876697 | 0.243576636386292 | 78 | 0.675 | 0.210711534210519 |
| Basophils_min | 1841 | 0.149158066268332 | 0.218220564281621 | 78 | 0.319230769230769 | 0.205797588306198 |
| Basophils_slope | 1780 | -0.000125764657213237 | 0.00136122634354122 | 77 | 9.58055039710486e-05 | 0.0022278482515412 |
| Bicarbonate_max | 2060 | 30.6850485436893 | 3.43297153888046 | 78 | 27.9217948717949 | 2.74570017734447 |
| Bicarbonate_median | 2060 | 26.8749757281553 | 2.41353713159108 | 78 | 24.4891025641026 | 2.20281177217431 |
| Bicarbonate_min | 2060 | 23.1913106796117 | 2.60953480706033 | 78 | 21.1871794871795 | 2.28571844821466 |
| Bicarbonate_slope | 2032 | -0.00101449911256295 | 0.0130995854343925 | 77 | 0.00190032515538861 | 0.00822812361338164 |
| Blood.Urea.Nitrogen..BUN._max | 2412 | 7.36933679519071 | 2.29440859659421 | 78 | 7.71951282051282 | 2.13834432094049 |
| Blood.Urea.Nitrogen..BUN._median | 2412 | 5.59243985074627 | 1.35901812948061 | 78 | 5.75203205128205 | 1.52476657855858 |
| Blood.Urea.Nitrogen..BUN._min | 2412 | 4.183202960199 | 1.35129325674648 | 78 | 4.20466666666667 | 1.29009231187546 |
| Blood.Urea.Nitrogen..BUN._slope | 2386 | 9.27453772868006e-06 | 0.00594527686260093 | 77 | -0.000867295953494294 | 0.00503249957013987 |
| BMI_max | 1363 | 0.00257989787651352 | 0.00043693592222543 | 78 | 0.0026310647019252 | 0.00040534507266651 |
| BMI_median | 1363 | 0.00257726803310333 | 0.000435285567144275 | 78 | 0.0026310647019252 | 0.00040534507266651 |
| BMI_min | 1363 | 0.00257463509610351 | 0.000434026355839315 | 78 | 0.0026310647019252 | 0.00040534507266651 |
| BMI_slope | 242 | 3.49831438107787e-07 | 4.80387777606735e-06 | 0 | NaN | NA |
| bp_diastolic_max | 2176 | 91.9954044117647 | 9.34311361240496 | 78 | 90.5641025641026 | 8.73058475837158 |
| bp_diastolic_median | 2176 | 81.1148897058823 | 7.76115519229348 | 78 | 80.1410256410256 | 7.30099505578033 |
| bp_diastolic_min | 2176 | 69.7720588235294 | 8.99585961407127 | 78 | 67.5512820512821 | 7.31596268602499 |
| bp_diastolic_slope | 2176 | -0.00128472740620271 | 0.0218826468989993 | 78 | -0.00667326941155003 | 0.0291790957823273 |
| bp_systolic_max | 2176 | 147.091452205882 | 16.5452608210706 | 78 | 143.5 | 14.0164560057469 |
| bp_systolic_median | 2176 | 129.504595588235 | 12.8633928425817 | 78 | 125.410256410256 | 11.2295249722431 |
| bp_systolic_min | 2176 | 113.931066176471 | 11.8678094489092 | 78 | 108.923076923077 | 11.1760054090492 |
| bp_systolic_slope | 2176 | -0.00799534171637676 | 0.0342797837462885 | 78 | -0.00965396319905002 | 0.0353097583980406 |
| Calcium_max | 2108 | 2.47509643026565 | 0.185095531617863 | 72 | 2.54430555555556 | 0.107252189168227 |
| Calcium_median | 2108 | 2.34559908206831 | 0.090798733570523 | 72 | 2.41982638888889 | 0.0875812765266859 |
| Calcium_min | 2108 | 2.22246499122391 | 0.177431835802872 | 72 | 2.28527777777778 | 0.10743123869952 |
| Calcium_slope | 2079 | 4.85624997284194e-05 | 0.000387001055673194 | 71 | -1.03281579376726e-05 | 0.000296532203830862 |
| Chloride_max | 2250 | 107.072577777778 | 2.72726774452538 | 78 | 105.24358974359 | 2.03977269828593 |
| Chloride_median | 2250 | 103.427733333333 | 2.40089925349514 | 78 | 102.339743589744 | 1.82246726894805 |
| Chloride_min | 2250 | 99.3092888888889 | 3.52116233627607 | 78 | 99.2179487179487 | 2.94824374707065 |
| Chloride_slope | 2219 | -0.00117245819928419 | 0.0128013752777168 | 77 | -0.00331822886696961 | 0.0109623128378599 |
| CK_max | 1945 | 512.351670951157 | 455.299383403162 | 36 | 1121.77777777778 | 3658.2300100389 |
| CK_median | 1945 | 301.531053984576 | 264.692112745275 | 36 | 264.194444444444 | 244.596819211236 |
| CK_min | 1945 | 170.035526992288 | 159.869237968098 | 36 | 142.055555555556 | 101.857307597988 |
| CK_slope | 1915 | -0.100658972347894 | 0.574678418733238 | 36 | 0.222995689834011 | 1.94438136425215 |
| Creatinine_max | 2412 | 79.3120580431177 | 20.4044964412935 | 78 | 76.72 | 36.889060497871 |
| Creatinine_median | 2412 | 65.7550116086235 | 17.898575703638 | 78 | 57.8228205128205 | 16.8596654538018 |
| Creatinine_min | 2412 | 52.5264943615257 | 18.5329613539573 | 78 | 44.0882051282051 | 17.636321858161 |
| Creatinine_slope | 2385 | -0.0290414087540594 | 0.0433624748291318 | 77 | -0.0441979350586713 | 0.0410528373669304 |
| Eosinophils_max | 1776 | 4.19279279279279 | 2.5296247840395 | 78 | 4.36538461538461 | 2.06591553385103 |
| Eosinophils_median | 1776 | 2.30242117117117 | 1.43340913219548 | 78 | 2.19358974358974 | 1.06180313967605 |
| Eosinophils_min | 1776 | 1.18220720720721 | 1.09648291708696 | 78 | 0.993589743589744 | 0.735613489631074 |
| Eosinophils_slope | 1715 | -0.000276042381163875 | 0.00373969605468944 | 77 | -0.000449842547448364 | 0.00614833883409007 |
| fvc_percent_max | 2257 | 87.7004342422103 | 17.1008822041781 | 64 | 92.9973958333333 | 17.4651704846968 |
| fvc_percent_median | 2257 | 75.5595081800905 | 18.9957503786435 | 64 | 77.640625 | 19.3027540219184 |
| fvc_percent_min | 2257 | 55.197839318849 | 25.2909433628203 | 64 | 54.3828125 | 27.7152578917573 |
| fvc_percent_slope | 2254 | -0.0663665666490335 | 0.19170083274742 | 62 | -0.0816629530015356 | 0.104075541116402 |
| Gamma.glutamyltransferase_max | 1867 | 58.5538296732726 | 79.0328778917009 | 37 | 46.4864864864865 | 31.6057498475107 |
| Gamma.glutamyltransferase_median | 1867 | 34.8623460096411 | 36.0274704864039 | 37 | 28.8918918918919 | 18.2570896180449 |
| Gamma.glutamyltransferase_min | 1867 | 24.3656132833423 | 24.0468871711512 | 37 | 20.2702702702703 | 11.466493818098 |
| Gamma.glutamyltransferase_slope | 1838 | 0.0098813623526445 | 0.143759624959074 | 36 | 0.00607747015560745 | 0.032959323679861 |
| Glucose_max | 2170 | 7.17329677419355 | 2.57277387970439 | 78 | 7.78570512820513 | 2.06115963520333 |
| Glucose_median | 2170 | 5.49431923963134 | 1.2772494555249 | 78 | 5.4673717948718 | 1.06605146691106 |
| Glucose_min | 2170 | 4.26760151612903 | 1.27924583695133 | 78 | 4.35230769230769 | 0.703532687057426 |
| Glucose_slope | 2196 | 0.000515908627398624 | 0.00469970375778664 | 77 | -0.000841185906820127 | 0.00882693606967879 |
| hands_max | 2424 | 6.19884488448845 | 1.94774457814011 | 101 | 5.8019801980198 | 2.16342229802782 |
| hands_median | 2424 | 4.93719059405941 | 2.43224315298442 | 101 | 3.98019801980198 | 2.65981276792109 |
| hands_min | 2424 | 3.11716171617162 | 2.59962266481603 | 101 | 2.21782178217822 | 2.41496981511587 |
| hands_slope | 2424 | -0.00570983606347995 | 0.00516913173845415 | 101 | -0.00587949118702615 | 0.00513066976183382 |
| HbA1c..Glycated.Hemoglobin._max | 1511 | 5.97647915287889 | 0.913463994691757 | NA | NA | NA |
| HbA1c..Glycated.Hemoglobin._median | 1511 | 5.44060886829914 | 0.639825431407871 | NA | NA | NA |
| HbA1c..Glycated.Hemoglobin._min | 1511 | 4.95290536068829 | 0.597971928656614 | NA | NA | NA |
| HbA1c..Glycated.Hemoglobin._slope | 1488 | -4.54496513342089e-05 | 0.00124058323118751 | NA | NA | NA |
| Hematocrit_max | 2224 | 41.9417153776978 | 12.9306948288911 | 78 | 47.7038461538462 | 3.86514367964965 |
| Hematocrit_median | 2224 | 39.4702032374101 | 12.118830542526 | 78 | 43.6935897435897 | 3.15355251104107 |
| Hematocrit_min | 2224 | 36.9653210431655 | 11.5569121896324 | 78 | 40.1410256410256 | 3.35851064306691 |
| Hematocrit_slope | 2194 | 0.000253297128528394 | 0.010010488485032 | 77 | 0.00147936738970224 | 0.0148984249534962 |
| Hemoglobin_max | 2224 | 152.142805755396 | 12.7185245173477 | 78 | 154.692307692308 | 11.7067054736571 |
| Hemoglobin_median | 2224 | 144.288893884892 | 11.6002376323046 | 78 | 144.282051282051 | 10.5123890248755 |
| Hemoglobin_min | 2224 | 135.460922661871 | 14.8872658504018 | 78 | 133.705128205128 | 11.748566062665 |
| Hemoglobin_slope | 2194 | -0.000237468012819504 | 0.0302987113256346 | 77 | -0.0038880080774771 | 0.0419916531816279 |
| leg_max | 2424 | 5.31848184818482 | 2.23395660623283 | 101 | 4.77227722772277 | 2.37436807643134 |
| leg_median | 2424 | 4.07054455445545 | 2.28205481918521 | 101 | 3.16831683168317 | 2.44670924684848 |
| leg_min | 2424 | 2.54001650165017 | 2.14786828635806 | 101 | 1.76237623762376 | 2.10783545302514 |
| leg_slope | 2424 | -0.00508442048414411 | 0.00465417680089854 | 101 | -0.00485356584888292 | 0.00423950162322656 |
| Lymphocytes_max | 1851 | 32.3286331712588 | 7.76821112114374 | 78 | 32.9987179487179 | 7.45470292752137 |
| Lymphocytes_median | 1851 | 25.8169097784981 | 6.31685372094809 | 78 | 25.2858974358974 | 5.92089573688022 |
| Lymphocytes_min | 1851 | 18.9065207995678 | 6.37767534510343 | 78 | 17.3807692307692 | 5.734358911672 |
| Lymphocytes_slope | 1779 | -0.00478133886995476 | 0.0159349899736102 | 77 | -0.00741887936214243 | 0.0385628828700545 |
| Monocytes_max | 1853 | 8.72142471667566 | 2.71222329547321 | 78 | 9.45769230769231 | 4.57663516249541 |
| Monocytes_median | 1853 | 5.96675661090124 | 1.77869519349936 | 78 | 6.56025641025641 | 1.57734280453246 |
| Monocytes_min | 1853 | 3.85895844576363 | 1.85833904941223 | 78 | 4.58717948717949 | 1.32785922567497 |
| Monocytes_slope | 1779 | -0.00184840813557744 | 0.00573745431276936 | 77 | -0.00157236519435078 | 0.0081939387424273 |
| mouth_max | 2424 | 10.7570132013201 | 1.89421875798073 | 101 | 10.5841584158416 | 1.89877500896064 |
| mouth_median | 2424 | 9.72524752475248 | 2.75501942032917 | 101 | 8.56930693069307 | 3.40369630179478 |
| mouth_min | 2424 | 7.84034653465347 | 3.71660194923749 | 101 | 6.32673267326733 | 4.13789538507461 |
| mouth_slope | 2424 | -0.00507728339996061 | 0.00628746488325555 | 101 | -0.00637143284150849 | 0.00636442787700772 |
| Neutrophils_max | 1786 | 73.5036394176932 | 7.59618459232422 | 42 | 74.8547619047619 | 6.58113350682167 |
| Neutrophils_median | 1786 | 65.0837066069429 | 7.3431646930973 | 42 | 64.9535714285714 | 6.04990821193366 |
| Neutrophils_min | 1786 | 57.0599048152296 | 8.91092977206641 | 42 | 56.2595238095238 | 7.81234356405515 |
| Neutrophils_slope | 1714 | 0.00669496389080631 | 0.0189001866113887 | 41 | 0.0144226259889641 | 0.0624384261323121 |
| onset_delta_mean | 2418 | -686.872208436725 | 414.451687020307 | 92 | -530.989130434783 | 231.437480916255 |
| Phosphorus_max | 2108 | 1.3922770256167 | 0.230172265157576 | 36 | 1.41924166666667 | 0.133117508508192 |
| Phosphorus_median | 2108 | 1.19182457305503 | 0.121608040253329 | 36 | 1.21089583333333 | 0.147335204974914 |
| Phosphorus_min | 2108 | 1.02383847248577 | 0.133486006088975 | 36 | 0.999513888888889 | 0.14844392717748 |
| Phosphorus_slope | 2077 | 5.64147967508512e-05 | 0.000425834118538429 | 36 | 0.00014922502619943 | 0.000302480822252272 |
| Platelets_max | 2106 | 285.874169040836 | 72.0559879135821 | 78 | 311.884615384615 | 78.7676149368328 |
| Platelets_median | 2106 | 239.102801519468 | 54.0511304228343 | 78 | 282.666666666667 | 65.079604933919 |
| Platelets_min | 2106 | 208.625449667616 | 50.6622707272523 | 78 | 261.153846153846 | 71.9935200769273 |
| Platelets_slope | 2078 | 0.0186003620328334 | 0.0992053808162082 | 39 | 0.0326726154314932 | 0.0989955472289394 |
| Potassium_max | 2411 | 4.62399834093737 | 1.2773264765901 | 78 | 4.64358974358974 | 0.383622427871686 |
| Potassium_median | 2411 | 4.18821858150145 | 0.248106500529091 | 78 | 4.16923076923077 | 0.216859336962862 |
| Potassium_min | 2411 | 3.85654500207383 | 0.26610617781299 | 78 | 3.77435897435897 | 0.253513440016628 |
| Potassium_slope | 2384 | -0.000426652821675778 | 0.0241506854806198 | 77 | -7.00343910746221e-05 | 0.00115760742615253 |
| Protein_max | 2043 | 76.8311306901615 | 4.25388686052967 | 36 | 74.0277777777778 | 5.07366371491681 |
| Protein_median | 2043 | 72.2857807146353 | 3.56415385033174 | 36 | 69.4583333333333 | 3.93405461506138 |
| Protein_min | 2043 | 68.0037689672051 | 4.13294383815223 | 36 | 64.8333333333333 | 4.46894043050795 |
| Protein_slope | 2079 | -0.00106407224981409 | 0.0107602891424065 | 36 | -0.000844172540810016 | 0.00643223506827082 |
| pulse_max | 2176 | 90.5772058823529 | 11.9819601837299 | 78 | 95.9615384615385 | 13.6728399919897 |
| pulse_median | 2176 | 76.8559283088235 | 9.11825461027748 | 78 | 78.7692307692308 | 9.88133893462111 |
| pulse_min | 2176 | 65.5698529411765 | 8.25689213617804 | 78 | 64.7051282051282 | 8.36366441524781 |
| pulse_slope | 2176 | 0.0109366783402785 | 0.028729134018368 | 78 | 0.0111239035221543 | 0.0284267227223097 |
| Red.Blood.Cells..RBC._max | 1974 | 205526512.31003 | 956239460.280868 | 78 | 5070.51282051282 | 426.321924078956 |
| Red.Blood.Cells..RBC._median | 1974 | 4670.78774062817 | 418.757968545842 | 78 | 4716.47435897436 | 372.235357900968 |
| Red.Blood.Cells..RBC._min | 1974 | 4385.89900202634 | 460.893142792287 | 78 | 4360.12820512821 | 405.262440509413 |
| Red.Blood.Cells..RBC._slope | 2011 | -85160.1917018794 | 549551.166890205 | 77 | 0.227030868857412 | 2.13966702912828 |
| respiratory_max | 2424 | 3.90594059405941 | 0.308467953791843 | 101 | 3.87128712871287 | 0.391493712251772 |
| respiratory_median | 2424 | 3.58415841584158 | 0.618435541384743 | 101 | 3.12871287128713 | 1.15467195632901 |
| respiratory_min | 2424 | 2.79909240924092 | 1.05623124072513 | 101 | 2.03960396039604 | 1.42070962606162 |
| respiratory_slope | 2424 | -0.00148984643845156 | 0.00236948213120654 | 101 | -0.0021931899434463 | 0.00291521931967747 |
| Sodium_max | 2411 | 143.361012028204 | 2.342410285652 | 78 | 143.192307692308 | 2.19833285302006 |
| Sodium_median | 2411 | 140.132061385317 | 1.79426317878301 | 78 | 140.269230769231 | 1.48300394446566 |
| Sodium_min | 2411 | 136.791165491497 | 2.8685634660764 | 78 | 137.448717948718 | 1.95831517764591 |
| Sodium_slope | 2384 | 0.00159240122849962 | 0.0171628406344419 | 77 | 0.000158744398170742 | 0.0072989488116711 |
| Total.Cholesterol_max | 1865 | 6.65718943699732 | 1.21795564118192 | 36 | 6.62690277777778 | 1.17291082634763 |
| Total.Cholesterol_median | 1865 | 5.87440393565684 | 1.03928644879369 | 36 | 5.72589722222222 | 0.9933042446404 |
| Total.Cholesterol_min | 1865 | 5.07456998284182 | 0.992283893885334 | 36 | 4.81631944444444 | 0.939556206125133 |
| Total.Cholesterol_slope | 1835 | -0.000145353277074433 | 0.00200216634153814 | 36 | 2.02647546391879e-05 | 0.00208590906741232 |
| Triglycerides_max | 1703 | 3.13492765120376 | 2.12890397829411 | 36 | 3.31741111111111 | 1.79473945753818 |
| Triglycerides_median | 1703 | 2.01650785965942 | 1.23693073131958 | 36 | 1.92714722222222 | 0.970796243661021 |
| Triglycerides_min | 1703 | 1.31833285613623 | 0.80349613930263 | 36 | 1.27751388888889 | 0.720539952754789 |
| Triglycerides_slope | 1673 | -0.000180328598627944 | 0.00239485561856983 | 36 | -0.000304740547514068 | 0.00197306999278624 |
| trunk_max | 2424 | 6.21452145214521 | 1.72936198599806 | 101 | 5.71287128712871 | 2.05103209952144 |
| trunk_median | 2424 | 4.91542904290429 | 2.1348631125207 | 101 | 3.7970297029703 | 2.53197375363745 |
| trunk_min | 2424 | 3.02062706270627 | 2.36871275208369 | 101 | 2.06930693069307 | 2.22376898864326 |
| trunk_slope | 2424 | -0.00593485685094305 | 0.00479970307092377 | 101 | -0.00585878681037928 | 0.0047303152106807 |
| Urine.Ph_max | 2042 | 6.80551420176298 | 0.971695384711862 | 78 | 7.07692307692308 | 0.674570200234901 |
| Urine.Ph_median | 2042 | 5.68303134182174 | 0.646495674028862 | 78 | 6.19230769230769 | 0.572535765517931 |
| Urine.Ph_min | 2042 | 5.19995102840353 | 0.452955883489195 | 78 | 5.56410256410256 | 0.45839198304858 |
| Urine.Ph_slope | 1827 | 0.000199748777318122 | 0.00588011875915285 | 77 | 0.000867929010297606 | 0.00442099295374208 |
| White.Blood.Cell..WBC._max | 1983 | 8.97351941502774 | 2.55746550076234 | 78 | 9.34179487179487 | 2.93952895934544 |
| White.Blood.Cell..WBC._median | 1983 | 6.95655219364599 | 1.59840051489864 | 78 | 6.78910256410256 | 1.68326421126584 |
| White.Blood.Cell..WBC._min | 1983 | 5.77965708522441 | 1.44307811759807 | 78 | 5.28602564102564 | 1.51386092770271 |
| White.Blood.Cell..WBC._slope | 2011 | 0.000831804155604706 | 0.00602416734953845 | 77 | 0.00147281746308479 | 0.00717243640578036 |
| Bilirubin..total._max | NA | NA | NA | 72 | 12.4413888888889 | 5.98023724282224 |
| Bilirubin..total._median | NA | NA | NA | 72 | 8.13923611111111 | 3.37452246592667 |
| Bilirubin..total._min | NA | NA | NA | 72 | 5.21527777777778 | 2.37266592332856 |
| Bilirubin..total._slope | NA | NA | NA | 77 | -7.9458431254426e-05 | 0.0151282727240631 |
data2<-data2[,order(colnames(data2))]
ID<-which(apply(is.na(data2),1,sum)<136)
data2_1<-data2[ID,]
col_ID<-which(apply(is.na(data2_1),2,sum)>9)
data2_2<-data2_1[,-col_ID]
for(i in 1:ncol(data2_2)){
m<-median(data2_2[,i])
for (j in 1:nrow(data2_2)){
if(is.na(data2_2[j,i])){data2_2[j,i]<-m}
if(is.infinite(data2_2[j,i])){data2_2[j,i]<-m}
}
}
data2_2=data.frame(apply(data2_2,2,fun))
#delete redundant BMI_max, BMI_median
data2_2<-data2_2[,-c(35,36)]
col_2<-colnames(data2_2)
data1<-data1[,order(colnames(data1))]
col_chose<-is.element(colnames(data1),col_2)
data1_1<-data1[,col_chose]
data1_2<-data1_1[,which(apply(is.na(data1_1),2,sum)<400)]
data1_3<-data1_2[which(apply(is.na(data1_2),1,sum)<20),]
for(i in 1:ncol(data1_3)){
m<-median(data1_3[,i])
for (j in 1:nrow(data1_3)){
if(is.na(data1_3[j,i])){data1_3[j,i]<-m}
if(is.infinite(data1_3[j,i])){data1_3[j,i]<-m}
}
}
data1_3<-data.frame(apply(data1_3,2,fun))
data2_2<-data2_2[,is.element(colnames(data2_2),colnames(data1_3))]
data2_2<-data2_2[,order(colnames(data2_2))]
data1_3<-data1_3[,order(colnames(data1_3))]
#Delete Race_mean
write.csv(data2_2[,-85],"complete_testing.csv", row.names = FALSE)
write.csv(data1_3[,-85],"complete_training.csv", row.names = FALSE)
### Gender: Male=2, Female=1
### Onset_site: Bulbar=1, Limb=2, Limb&Bular=3
###Final Training Dataset
df_train <- read.csv("complete_training.csv")
age_final <- df_train$Age_mean #2223 obs
age_stat <- c(min(age_final),median(age_final), max(age_final))
onsetdelta_final <- df_train$onset_delta_mean
onsetdelta_stat <- c(min(onsetdelta_final),median(onsetdelta_final), max(onsetdelta_final))
gender_final <- df_train$Gender_mean
gender_stat <- c(sum(gender_final==2)/2223,sum(gender_final==1)/2223)
onset_final <- df_train$onset_site_mean
onset_stat <- table(onset_final)
###Raw PRO-ACT Dataset
age_raw <- df[df$feature_name == "Age",] #6565 obs
age_raw$feature_value <- as.numeric(age_raw$feature_value)
age_stat <- c(min(age_raw$feature_value),median(age_raw$feature_value), max(age_raw$feature_value))
gender_raw <- df[df$feature_name == "Gender",] #6565 obs
gender_stat <- c(sum(gender_raw$feature_value=="M")/nrow(gender_raw),sum(gender_raw$feature_value=="F")/nrow(gender_raw))
onsetdelta_raw <- df[df$feature_name == "onset_delta",] #4985 obs
patient_with_postive_onsetdelta <- onsetdelta_raw[onsetdelta_raw$feature_value == "184",]
onsetdelta_raw$feature_value <- as.numeric(onsetdelta_raw$feature_value)
onsetdelta_stat <- c(min(onsetdelta_raw$feature_value),median(onsetdelta_raw$feature_value), max(onsetdelta_raw$feature_value))
onset_raw <- df[df$feature_name == "onset_site",] #7351 obs
onset_stat <- table(onset_raw$feature_value)
training <- read.csv("complete_training.csv",header=TRUE)
testing <- read.csv("complete_testing.csv",header=TRUE)
obj<-lm(ALSFRS_slope~.,data=training[,-93])
obj2<-step(obj, trace = FALSE)
y_p<-predict(obj2,testing)
test.y<-testing$ALSFRS_slope
write.csv(y_p,"LR_predict.csv", row.names = FALSE)
set.seed(7788888)
#load datasets
df_train <- read.csv("complete_training.csv")
df_test <- read.csv("complete_testing.csv")
#data type transformation
df_train$onset_site_mean <- as.factor(df_train$onset_site_mean)
df_test$onset_site_mean <- as.factor(df_test$onset_site_mean)
levels(df_train$onset_site_mean)[4] <- "4"
df_train$Gender_mean <- as.factor(df_train$Gender_mean)
df_test$Gender_mean <- as.factor(df_test$Gender_mean)
rf.fit <- train(ALSFRS_slope~., data = df_train[,-93], method = "rf", trControl =
trainControl(method = "cv", number = 5), allowParallel=TRUE)
rf.result <- data.frame(df_test$ALSFRS_slope)
colnames(rf.result)[1] <- "ALSFRS_slope"
rf.result$prediction <- predict(rf.fit, df_test)
write.csv(rf.result,"RF_predict.csv",row.names = FALSE)
#########BART###############
# set_bart_machine_memory(10000)
set.seed(614)
data1<-read.csv('complete_training.csv',header=TRUE)
train.x<-data1[,-c(6,93)]
train.y<-data1[,6]
data2<-read.csv('complete_testing.csv',header=TRUE)
test.x<-data2[,-c(6,93)]
test.y<-data2[,6]
options(java.parameters="-Xmx68000m")
set_bart_machine_num_cores(8)
bart_machine_cv <- bartMachineCV(train.x, train.y, mem_cache_for_speed = FALSE)##CV
summary(bart_machine_cv)
y_p<-bart_predict_for_test_data(bart_machine_cv,test.x,test.y)$y_hat
write.csv(y_p,"bart_predict.csv")
set.seed(23432)
data1<-read.csv("complete_training.csv", header =TRUE)
train.x<-data1[,-c(6,93)]
train.y<-data1[,6]
data2<-read.csv("complete_testing.csv",header=TRUE)
test.x<-data2[,-c(6,93)]
test.y<-data2[,6]
test.x<-test.x[,order(colnames(test.x))]
train.x<-train.x[,order(colnames(train.x))]
# generate Library and run Super Learner
SL.library <- c("SL.svm", "SL.ridge", "SL.randomForest","SL.mean","SL.caret", "SL.rpart","SL.stepAIC","SL.step.forward", "SL.step")
test_predict <- SampleSplitSuperLearner(Y = train.y, X = train.x, newX = test.x, SL.library = SL.library, verbose = FALSE, method = "method.NNLS")
## + Fold01: mtry= 2
## - Fold01: mtry= 2
## + Fold01: mtry=50
## - Fold01: mtry=50
## + Fold01: mtry=98
## - Fold01: mtry=98
## + Fold02: mtry= 2
## - Fold02: mtry= 2
## + Fold02: mtry=50
## - Fold02: mtry=50
## + Fold02: mtry=98
## - Fold02: mtry=98
## + Fold03: mtry= 2
## - Fold03: mtry= 2
## + Fold03: mtry=50
## - Fold03: mtry=50
## + Fold03: mtry=98
## - Fold03: mtry=98
## + Fold04: mtry= 2
## - Fold04: mtry= 2
## + Fold04: mtry=50
## - Fold04: mtry=50
## + Fold04: mtry=98
## - Fold04: mtry=98
## + Fold05: mtry= 2
## - Fold05: mtry= 2
## + Fold05: mtry=50
## - Fold05: mtry=50
## + Fold05: mtry=98
## - Fold05: mtry=98
## + Fold06: mtry= 2
## - Fold06: mtry= 2
## + Fold06: mtry=50
## - Fold06: mtry=50
## + Fold06: mtry=98
## - Fold06: mtry=98
## + Fold07: mtry= 2
## - Fold07: mtry= 2
## + Fold07: mtry=50
## - Fold07: mtry=50
## + Fold07: mtry=98
## - Fold07: mtry=98
## + Fold08: mtry= 2
## - Fold08: mtry= 2
## + Fold08: mtry=50
## - Fold08: mtry=50
## + Fold08: mtry=98
## - Fold08: mtry=98
## + Fold09: mtry= 2
## - Fold09: mtry= 2
## + Fold09: mtry=50
## - Fold09: mtry=50
## + Fold09: mtry=98
## - Fold09: mtry=98
## + Fold10: mtry= 2
## - Fold10: mtry= 2
## + Fold10: mtry=50
## - Fold10: mtry=50
## + Fold10: mtry=98
## - Fold10: mtry=98
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 50 on full training set
## + Fold01: mtry= 2
## - Fold01: mtry= 2
## + Fold01: mtry=50
## - Fold01: mtry=50
## + Fold01: mtry=98
## - Fold01: mtry=98
## + Fold02: mtry= 2
## - Fold02: mtry= 2
## + Fold02: mtry=50
## - Fold02: mtry=50
## + Fold02: mtry=98
## - Fold02: mtry=98
## + Fold03: mtry= 2
## - Fold03: mtry= 2
## + Fold03: mtry=50
## - Fold03: mtry=50
## + Fold03: mtry=98
## - Fold03: mtry=98
## + Fold04: mtry= 2
## - Fold04: mtry= 2
## + Fold04: mtry=50
## - Fold04: mtry=50
## + Fold04: mtry=98
## - Fold04: mtry=98
## + Fold05: mtry= 2
## - Fold05: mtry= 2
## + Fold05: mtry=50
## - Fold05: mtry=50
## + Fold05: mtry=98
## - Fold05: mtry=98
## + Fold06: mtry= 2
## - Fold06: mtry= 2
## + Fold06: mtry=50
## - Fold06: mtry=50
## + Fold06: mtry=98
## - Fold06: mtry=98
## + Fold07: mtry= 2
## - Fold07: mtry= 2
## + Fold07: mtry=50
## - Fold07: mtry=50
## + Fold07: mtry=98
## - Fold07: mtry=98
## + Fold08: mtry= 2
## - Fold08: mtry= 2
## + Fold08: mtry=50
## - Fold08: mtry=50
## + Fold08: mtry=98
## - Fold08: mtry=98
## + Fold09: mtry= 2
## - Fold09: mtry= 2
## + Fold09: mtry=50
## - Fold09: mtry=50
## + Fold09: mtry=98
## - Fold09: mtry=98
## + Fold10: mtry= 2
## - Fold10: mtry= 2
## + Fold10: mtry=50
## - Fold10: mtry=50
## + Fold10: mtry=98
## - Fold10: mtry=98
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 50 on full training set
y_p_sl<-test_predict$SL.predict
write.csv(y_p_sl,"SL_predict.csv",row.names = FALSE)
## measurement LinearRegression RandomForests BART
## 1 R-squared 0.562409822938255 0.6880210876278 0.680649317033897
## 2 RMSE 0.426882827349752 0.360443768743188 0.364677380647518
## 3 Correlation 0.758198331190405 0.831666868731394 0.828442892519017
## SuperLearner
## 1 0.655100925359185
## 2 0.378984036164353
## 3 0.819548257119423
##train_AE.csv obtained by excel
df1<-read.csv("Data/train_AE.csv", header=TRUE, fill = TRUE, stringsAsFactors = FALSE)
ID<-grep("Gastrointestinal",df1$feature_name)
df1$resp<-0
ID1<-intersect(grep("failure",df1$feature_value),grep("Respiratory",df1$feature_name))
ID2<-intersect(grep("Choking",df1$feature_value),grep("Respiratory",df1$feature_name))
ID3<-intersect(grep("insufficiency",df1$feature_value),grep("Respiratory",df1$feature_name))
ID4<-intersect(grep("Orthopnea",df1$feature_value),grep("Respiratory",df1$feature_name))
ID5<-intersect(grep("distress",df1$feature_value),grep("Respiratory",df1$feature_name))
ID6<-intersect(grep("Hoarse voice",df1$feature_value),grep("Respiratory",df1$feature_name))
df1$resp[ID1]<-1
df1$resp[ID2]<-df1$resp[ID2]+1
df1$resp[ID3]<-df1$resp[ID3]+1
df1$resp[ID4]<-df1$resp[ID4]+1
df1$resp[ID5]<-df1$resp[ID5]+1
df1$resp[ID6]<-df1$resp[ID6]+1
df1$infection<-0
ID1<-intersect(grep("upper respiratory",df1$feature_value,ignore.case = TRUE),grep("infection",df1$feature_name,ignore.case = TRUE))
df1$infection[ID1]<-1
df1$muscle<-0
ID1<-intersect(grep("weakness",df1$feature_value,ignore.case = TRUE),grep("muscle",df1$feature_name,ignore.case = TRUE))
df1$muscle[ID1]<-1
df1$other<-0
ID1<-grep("weight decrease",df1$feature_value,ignore.case = TRUE)
df1$other[ID1]<-1
df1$injuries<-0
ID1<-intersect(grep("fall",df1$feature_value,ignore.case = TRUE),grep("injuries",df1$feature_name,ignore.case = TRUE))
df1$injuries[ID1]<-1
df1$move<-0
ID1<-intersect(grep("tremor",df1$feature_value,ignore.case = TRUE),grep("movement",df1$feature_name,ignore.case = TRUE))
df1$move[ID1]<-1
df1$musculoskeletal<-0
ID1<-intersect(grep("chewing",df1$feature_value,ignore.case = TRUE),grep("musculoskeletal",df1$feature_name,ignore.case = TRUE))
df1$musculoskeletal[ID1]<-1
df1$neurological<-0
ID1<-intersect(grep("speech",df1$feature_value,ignore.case = TRUE),grep("neurological",df1$feature_name,ignore.case = TRUE))
df1$neurological[ID1]<-1
df1$neuromuscular<-0
ID1<-intersect(grep("spasticity",df1$feature_value,ignore.case = TRUE),grep("neuromuscular",df1$feature_name,ignore.case = TRUE))
df1$neuromuscular[ID1]<-1
df1$cranial_nerve<-0
ID1<-intersect(grep("paralysis",df1$feature_value,ignore.case = TRUE),grep("cranial nerve",df1$feature_name,ignore.case = TRUE))
df1$cranial_nerve[ID1]<-1
df1$neurological<-0
ID1<-intersect(grep("dysarthria",df1$feature_value,ignore.case = TRUE),grep("neurological",df1$feature_name,ignore.case = TRUE))
df1$neurological[ID1]<-1
df1$peripheral_neuropath<-0
ID1<-intersect(grep("foot drop",df1$feature_value,ignore.case = TRUE),grep("peripheral neuropath",df1$feature_name,ignore.case = TRUE))
df1$peripheral_neuropath[ID1]<-1
df1<-as.data.frame(df1)
df_disease1<-df1[,c(2,6:16)]
aaa1<-rowsum(df_disease1,df_disease1$SubjectID)
aaa1$SubjectID<-as.numeric(rownames(aaa1))
write.csv(aaa1,"AE_mapping_train_wizID.csv", row.names = FALSE)
##test_AE.csv obtained by excel
df1<-read.csv("Data/test_AE.csv", header=TRUE, fill = TRUE, stringsAsFactors = FALSE)
ID<-grep("Gastrointestinal",df1$feature_name)
df1$resp<-0
ID1<-intersect(grep("failure",df1$feature_value),grep("Respiratory",df1$feature_name))
ID2<-intersect(grep("Choking",df1$feature_value),grep("Respiratory",df1$feature_name))
ID3<-intersect(grep("insufficiency",df1$feature_value),grep("Respiratory",df1$feature_name))
ID4<-intersect(grep("Orthopnea",df1$feature_value),grep("Respiratory",df1$feature_name))
ID5<-intersect(grep("distress",df1$feature_value),grep("Respiratory",df1$feature_name))
ID6<-intersect(grep("Hoarse voice",df1$feature_value),grep("Respiratory",df1$feature_name))
df1$resp[ID1]<-1
df1$resp[ID2]<-df1$resp[ID2]+1
df1$resp[ID3]<-df1$resp[ID3]+1
df1$resp[ID4]<-df1$resp[ID4]+1
df1$resp[ID5]<-df1$resp[ID5]+1
df1$resp[ID6]<-df1$resp[ID6]+1
df1$infection<-0
ID1<-intersect(grep("upper respiratory",df1$feature_value,ignore.case = TRUE),grep("infection",df1$feature_name,ignore.case = TRUE))
df1$infection[ID1]<-1
df1$muscle<-0
ID1<-intersect(grep("weakness",df1$feature_value,ignore.case = TRUE),grep("muscle",df1$feature_name,ignore.case = TRUE))
df1$muscle[ID1]<-1
df1$other<-0
ID1<-grep("weight decrease",df1$feature_value,ignore.case = TRUE)
df1$other[ID1]<-1
df1$injuries<-0
ID1<-intersect(grep("fall",df1$feature_value,ignore.case = TRUE),grep("injuries",df1$feature_name,ignore.case = TRUE))
df1$injuries[ID1]<-1
df1$move<-0
ID1<-intersect(grep("tremor",df1$feature_value,ignore.case = TRUE),grep("movement",df1$feature_name,ignore.case = TRUE))
df1$move[ID1]<-1
df1$musculoskeletal<-0
ID1<-intersect(grep("chewing",df1$feature_value,ignore.case = TRUE),grep("musculoskeletal",df1$feature_name,ignore.case = TRUE))
df1$musculoskeletal[ID1]<-1
df1$neurological<-0
ID1<-intersect(grep("speech",df1$feature_value,ignore.case = TRUE),grep("neurological",df1$feature_name,ignore.case = TRUE))
df1$neurological[ID1]<-1
df1$neuromuscular<-0
ID1<-intersect(grep("spasticity",df1$feature_value,ignore.case = TRUE),grep("neuromuscular",df1$feature_name,ignore.case = TRUE))
df1$neuromuscular[ID1]<-1
df1$cranial_nerve<-0
ID1<-intersect(grep("paralysis",df1$feature_value,ignore.case = TRUE),grep("cranial nerve",df1$feature_name,ignore.case = TRUE))
df1$cranial_nerve[ID1]<-1
df1$neurological<-0
ID1<-intersect(grep("dysarthria",df1$feature_value,ignore.case = TRUE),grep("neurological",df1$feature_name,ignore.case = TRUE))
df1$neurological[ID1]<-1
df1$peripheral_neuropath<-0
ID1<-intersect(grep("foot drop",df1$feature_value,ignore.case = TRUE),grep("peripheral neuropath",df1$feature_name,ignore.case = TRUE))
df1$peripheral_neuropath[ID1]<-1
df1<-as.data.frame(df1)
df_disease2<-df1[,c(2,8:18)]
aaa2<-rowsum(df_disease2,df_disease2$SubjectID)
aaa2$SubjectID<-as.numeric(rownames(aaa2))
write.csv(aaa2,"AE_mapping_test_wizID.csv", row.names = FALSE)
training<-read.csv("complete_training.csv",header=TRUE)
testing<-read.csv("complete_testing.csv",header=TRUE)
training_ae<-read.csv("AE_mapping_train_wizID.csv",header=TRUE)
testing_ae<-read.csv("AE_mapping_test_wizID.csv",header=TRUE)
train<-merge(training,training_ae,by="SubjectID")
test<-merge(testing,testing_ae,by="SubjectID")
score_ae<-apply(train[,101:111],1,sum)
train$score_ae<-score_ae
score_ae<-apply(test[,101:111],1,sum)
test$score_ae<-score_ae
write.csv(train,"training_wizAE.csv", row.names = FALSE)
write.csv(test,"testing_wizAE.csv", row.names = FALSE)
df_t <- read.csv("training_wizAE.csv")
df_t$AE <- 1 #1519
df_t[df_t$score_ae == 0,]$AE <- 0 #669
AE_group <- which(df_t$AE==1)
NOAE_group <- which(df_t$AE==0)
FeatureIndex <- setdiff(c(1:113), c(1,48,74,101:113)) #SubjectID(1),Gender_mean(48),onset_site_mean(74), AE(101:113)
df_t2 <- df_t[,-c(1,48,74,101:113)]
wTest.tbl <- data.frame(FeatureIndex, names(df_t)[FeatureIndex], c(rep(NA, length(FeatureIndex))), c(rep(NA, length(FeatureIndex))), c(rep(NA, length(FeatureIndex))), c(rep(NA,length(FeatureIndex))))
colnames(wTest.tbl)[1:6] <- c("Feature.Index","FeatureName","Wilcoxon.pvalue","FDR.corrected.pvalue","W.statistic", "Significance")
#test
for (i in 1:ncol(df_t2)){
t.result <- wilcox.test(df_t[AE_group, i], df_t[NOAE_group, i], paired = FALSE)
wTest.tbl[i,3] <- round(t.result$p.value, digits = 4)
wTest.tbl[i,5] <- round(t.result$statistic)
}
wTest.tbl$FDR.corrected.pvalue <- round(p.adjust(wTest.tbl$Wilcoxon.pvalue, "BH",n = length(wTest.tbl$Wilcoxon.pvalue)), digits=4)
wTest.tbl[wTest.tbl$FDR.corrected.pvalue < (0.05),]$Significance <- "*"
wTest.tbl[wTest.tbl$FDR.corrected.pvalue < (0.01),]$Significance <- "**"
wTest.tbl[wTest.tbl$FDR.corrected.pvalue < (0.001),]$Significance <- "***"
wTest.tbl2 <- wTest.tbl[!is.na(wTest.tbl$Significance), ]
write.csv(wTest.tbl2,"wilcoxonTest_AE.csv", row.names = FALSE)
###########
#Chi-squared test for Gender and onset_site
#Gender
tbl_gender <- table(df_t$AE,df_t$Gender_mean)
chisq.test(tbl_gender) #Not significant p=0.05765
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: tbl_gender
## X-squared = 3.6037, df = 1, p-value = 0.05765
#Onset_site
tbl_site <- table(df_t$AE,df_t$onset_site_mean)
chisq.test(tbl_site) #Significant p=0.001583
##
## Pearson's Chi-squared test
##
## data: tbl_site
## X-squared = 12.897, df = 2, p-value = 0.001583
#load datasets
df_train <- read.csv("training_wizAE.csv")
df_test <- read.csv("testing_wizAE.csv")
#remove unnecessary columns
df_train <- df_train[,-1]
df_test <- df_test[,-1]
#data type transformation?(onset_site_mean, Gender_mean)
df_train$onset_site_mean <- as.factor(df_train$onset_site_mean)
df_test$onset_site_mean <- as.factor(df_test$onset_site_mean)
levels(df_train$onset_site_mean)[4] <- "4"
df_train$Gender_mean <- as.factor(df_train$Gender_mean)
df_test$Gender_mean <- as.factor(df_test$Gender_mean)
################RandomForests#######################
set.seed(49239)
#without AE
rf.fit1 <- train(ALSFRS_slope~., data = df_train[,-c(100:111)], method = "rf", trControl =
trainControl(method = "cv", number = 5), allowParallel=TRUE, importance = TRUE)
#with AE
rf.fit2 <- train(ALSFRS_slope~., data = df_train, method = "rf", trControl =
trainControl(method = "cv", number = 5), allowParallel=TRUE, importance = TRUE)
#obtain importance measures of features in the fitted models
rf_noAE.ImpMeasure <- data.frame(varImp(rf.fit1, scale = FALSE)$importance)
rf_noAE.ImpMeasure$Vars <- row.names(rf_noAE.ImpMeasure)
rf_noAE.ImpMeasure <- rf_noAE.ImpMeasure[order(-rf_noAE.ImpMeasure$Overall),]
rf_AE.ImpMeasure <- data.frame(varImp(rf.fit2,scale = FALSE)$importance)
rf_AE.ImpMeasure$Vars <- row.names(rf_AE.ImpMeasure)
rf_AE.ImpMeasure <- rf_AE.ImpMeasure[order(-rf_AE.ImpMeasure$Overall),]
#output
write.csv(rf_noAE.ImpMeasure, "RF_ImpMeasure_withoutAE.csv", row.names = FALSE)
write.csv(rf_AE.ImpMeasure, "RF_ImpMeasure_withAE.csv", row.names = FALSE)
options(java.parameters="-Xmx68000m")
library(bartMachine)
set.seed(614)
#without AE
data1<-read.csv('training_wizAE.csv',header=TRUE)
train.x<-data1[,-c(1,7,101:112)]
train.y<-data1[,7]
data2<-read.csv('testing_wizAE.csv',header=TRUE)
test.x<-data2[,-c(1,7,101:112)]
test.y<-data2[,7]
set_bart_machine_num_cores(8)
bart_machine_cv <- bartMachineCV(train.x, train.y, mem_cache_for_speed = FALSE)##CV
aaa1<-investigate_var_importance(bart_machine_cv,type="splits", plot=FALSE, num_replicates_for_avg=5,num_trees_bottleneck=20)
score1<-aaa1$avg_var_props
write.csv(score1,"BART_ImpMeasure_withoutAE.csv")
#with AE
data1<-read.csv('training_wizAE.csv',header=TRUE)
train.x<-data1[,-c(1,7)]
train.y<-data1[,7]
data2<-read.csv('testing_wizAE.csv',header=TRUE)
test.x<-data2[,-c(1,7)]
test.y<-data2[,7]
set_bart_machine_num_cores(8)
bart_machine_cv <- bartMachineCV(train.x, train.y, mem_cache_for_speed = FALSE)##CV
aaa2<-investigate_var_importance(bart_machine_cv,type="splits", plot=FALSE, num_replicates_for_avg=5,num_trees_bottleneck=20)
score2<-aaa2$avg_var_props
write.csv(score2,"BART_ImpMeasure_withAE.csv")
#obtain subjects with BMI obs and retain those with >=2 obs(as we want to see the trend)
df_bmi<-df[df$feature_name=="BMI",]
df_bmi$feature_value<-as.numeric(df_bmi$feature_value)
df_bmi$feature_delta<-as.numeric(df_bmi$feature_delta)
sID<-as.data.frame(table(df_bmi$SubjectID))
ID<-as.numeric(as.character(sID$Var1[sID$Freq>1]))
df_bmi_1<-df_bmi[df_bmi$SubjectID %in% ID, ]
#obatin subjectID information(with or without Riluzole treatment)
riluzole_yes <- df[df$feature_name=="if_use_Riluzole" & df$feature_value == "Yes", ]
riluzole_no <- df[df$feature_name=="if_use_Riluzole" & df$feature_value == "No", ]
#keep the study subjects with both BMI(n>1) and Riluzole treatment information
bmi_riluzole <- df_bmi_1[df_bmi_1$SubjectID %in% riluzole_yes$SubjectID, ] #Number of Patients=252, obs=737
bmi_no_riluzole <- df_bmi_1[df_bmi_1$SubjectID %in% riluzole_no$SubjectID, ] #Number of Patients=45, obs=311
#Wilcoxon-Mann-Whitney test
bmi_mean_riluzole <- ddply(bmi_riluzole,~SubjectID,summarise,mean=mean(feature_value))
bmi_mean_noriluzole <- ddply(bmi_no_riluzole,~SubjectID,summarise,mean=mean(feature_value))
t.result <- wilcox.test(bmi_mean_riluzole$mean,bmi_mean_noriluzole$mean)#W = 3027, p-value = 6.373e-07
png("BMI with Riluzole.png", width=6000, height= 4000, res = 300)
g1 <- ggplot(bmi_riluzole[bmi_riluzole$feature_delta<300,], aes(feature_delta, feature_value*10000)) + geom_point()
g1 <- g1 + geom_smooth(method = "loess")
g1 <- g1 + xlab("Days") + ylab("BMI") + ggtitle("BMI for Patients with Riluzole Treatment")
g1 <- g1 + theme(plot.title = element_text(size=45, face = "bold"), axis.text.x = element_text(size=16),axis.text.y =element_text(size=16),axis.title=element_text(size=30, face = "bold"))
g1 <- g1 + scale_x_continuous(limits = c(0, 250)) + scale_y_continuous(limits = c(15, 45))
print(g1)
dev.off()
## quartz_off_screen
## 2
png("BMI without Riluzole.png", width=6000, height= 4000, res = 300)
g2 <- ggplot(bmi_no_riluzole, aes(feature_delta, feature_value*10000)) + geom_point()
g2 <- g2 + geom_smooth(method = "loess")
g2 <- g2 + xlab("Days") + ylab("BMI") + ggtitle("BMI for Patients without Riluzole Treatment")
g2 <- g2 + theme(plot.title = element_text(size=45, face = "bold"), axis.text.x = element_text(size=16),axis.text.y =element_text(size=16),axis.title=element_text(size=30, face = "bold"))
g2 <- g2 + scale_x_continuous(limits = c(0, 250)) + scale_y_continuous(limits = c(15, 45))
print(g2)
dev.off()
## quartz_off_screen
## 2
#### Density plots of 33 features comparing patients with AEs and patients without AEs(Table S2, The results are not shown in this markdown)####
## [1] 5
## [1] "Albumin_min"
## [1] 8
## [1] "ALSFRS_Total_max"
## [1] 11
## [1] "ALSFRS_Total_slope"
## [1] 12
## [1] "ALT.SGPT._max"
## [1] 13
## [1] "ALT.SGPT._median"
## [1] 17
## [1] "AST.SGOT._median"
## [1] 20
## [1] "Bicarbonate_max"
## [1] 21
## [1] "Bicarbonate_median"
## [1] 22
## [1] "Bicarbonate_min"
## [1] 23
## [1] "Bicarbonate_slope"
## [1] 33
## [1] "bp_systolic_median"
## [1] 34
## [1] "bp_systolic_min"
## [1] 35
## [1] "bp_systolic_slope"
## [1] 38
## [1] "Calcium_min"
## [1] 39
## [1] "Calcium_slope"
## [1] 40
## [1] "Chloride_max"
## [1] 49
## [1] "Glucose_max"
## [1] 51
## [1] "Glucose_min"
## [1] 52
## [1] "Glucose_slope"
## [1] 59
## [1] "Hematocrit_min"
## [1] 70
## [1] "mouth_median"
## [1] 71
## [1] "mouth_min"
## [1] 72
## [1] "mouth_slope"
## [1] 73
## [1] "onset_delta_mean"
## [1] 75
## [1] "Platelets_max"
## [1] 77
## [1] "Platelets_min"
## [1] 84
## [1] "pulse_min"
## [1] 85
## [1] "pulse_slope"
## [1] 90
## [1] "Sodium_max"
## [1] 91
## [1] "Sodium_median"
## [1] 92
## [1] "Sodium_min"
## [1] 95
## [1] "trunk_median"
## [1] 96
## [1] "trunk_min"